library(corrplot)
library(cluster)
library(factoextra)
library(reshape2)
library(tidyverse)
library(RColorBrewer)
library(scales)
library(fpc)
library(EnvStats)
library(NbClust)
library(dendextend)
library(gtools)
library(ClusterR)
library(dendextend)
The dataset is related to variety of wine variants. The goal is to model wine quality based on physicochemical tests.
These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are many more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure if all input variables are relevant. So it could be interesting to test feature selection methods.
The data set contains 18 variables that involve wine realted attributes.
| Column Name | Description |
|---|---|
| 1 - fixed acidity | fixed acidity of the wine |
| 2 - volatile acidity | volatile acidity of the wine |
| 3 - citric acid | citric acid in wine observation |
| 4 - residual sugar | residual sugar in wine |
| 5 - chlorides | cholorides in wine |
| 6 - free sulfur dioxide | free sulfur dioxide |
| 7 - total sulfur dioxide | total sulfur dioxide |
| 8 - density | Wine density |
| 9 - pH | Wine pH value |
| 10 - sulphates | sulphates in wine |
| 11 - alcohol | Wine alchocoal level |
| 12 - #add rowid (0 to 10) | wien quality based on based on sensory data) |
| 13 - country | country origin of wine |
| 14 - province | province origin of wine |
| 15 - region_1 | Region of wine origin |
| 16 - region_2 | sub-region of wine origin |
| 17 - variety | Type of wine |
| 18 - winery | Name of the winery |
Read data
#Read data
raw <- read.csv('./data/wine-with-type-location.csv', header=TRUE, sep = ";")
Add rowid
#add rowid
raw <- tibble::rowid_to_column(raw, "ROWID")
head(raw, 5)
## ROWID fixed.acidity volatile.acidity citric.acid residual.sugar
## 1 1 7.4 0.70 0.00 1.9
## 2 2 7.8 0.88 0.00 2.6
## 3 3 7.8 0.76 0.04 2.3
## 4 4 11.2 0.28 0.56 1.9
## 5 5 7.4 0.70 0.00 1.9
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 1 0.076 11 34 0.9978 3.51
## 2 0.098 25 67 0.9968 3.20
## 3 0.092 15 54 0.9970 3.26
## 4 0.075 17 60 0.9980 3.16
## 5 0.076 11 34 0.9978 3.51
## sulphates alcohol quality country province region_1
## 1 0.56 9.4 5 Italy Sicily & Sardinia Etna
## 2 0.68 9.8 5 Portugal Douro
## 3 0.65 9.8 5 US Oregon Willamette Valley
## 4 0.58 9.8 6 US Michigan Lake Michigan Shore
## 5 0.56 9.4 5 US Oregon Willamette Valley
## region_2 variety winery
## 1 White Blend Nicosia
## 2 Portuguese Red Quinta dos Avidagos
## 3 Willamette Valley Pinot Gris Rainstorm
## 4 Riesling St. Julian
## 5 Willamette Valley Pinot Noir Sweet Cheeks
# summary(raw)
#numeric data, 'quality' variable is dependent variable
data <- raw[c(1:12)]
summary(data)
## ROWID fixed.acidity volatile.acidity citric.acid
## Min. : 1.0 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.: 400.5 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median : 800.0 Median : 7.90 Median :0.5200 Median :0.260
## Mean : 800.0 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:1199.5 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :1599.0 Max. :15.90 Max. :1.5800 Max. :1.000
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
## Median : 2.200 Median :0.07900 Median :14.00
## Mean : 2.539 Mean :0.08747 Mean :15.87
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
## Max. :15.500 Max. :0.61100 Max. :72.00
## total.sulfur.dioxide density pH sulphates
## Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
## 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
## Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
## Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
## 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
## Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
## alcohol
## Min. : 8.40
## 1st Qu.: 9.50
## Median :10.20
## Mean :10.42
## 3rd Qu.:11.10
## Max. :14.90
No NAs or Nans or blanks found
str(data)
## 'data.frame': 1599 obs. of 12 variables:
## $ ROWID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
nrow(data)
## [1] 1599
#Melt data
melt_data = melt(data, id.vars=c("ROWID"))
#visualize spread of data
ggplot(melt_data, mapping = aes(x = value)) + geom_bar(fill = "#FF6666") + facet_wrap(~variable, scales = 'free_x')
boxplot(data[,-c(1)])
Above shows data needs scaling and has outliers
#3 outlier
rosnerTest(data$volatile.acidity, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$volatile.acidity
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 5.876138
## R.2 = 4.531485
## R.3 = 4.562348
## R.4 = 4.079513
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 3
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 0.5278205 0.1790597 1.58 1300 5.876138 4.153388 TRUE
## 2 1 0.5271621 0.1771688 1.33 127 4.531485 4.153240 TRUE
## 3 2 0.5266594 0.1760805 1.33 128 4.562348 4.153091 TRUE
## 4 3 0.5261560 0.1749827 1.24 673 4.079513 4.152943 FALSE
#0 outliers
rosnerTest(data$citric.acid, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$citric.acid
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 3.742403
## R.2 = 2.677655
## R.3 = 2.632885
## R.4 = 2.535969
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 0
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 0.2709756 0.1948011 1.00 152 3.742403 4.153388 FALSE
## 2 1 0.2705194 0.1940058 0.79 354 2.677655 4.153240 FALSE
## 3 2 0.2701941 0.1936301 0.78 1575 2.632885 4.153091 FALSE
## 4 3 0.2698747 0.1932695 0.76 259 2.535969 4.152943 FALSE
#4 outliers
rosnerTest(data$residual.sugar, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$residual.sugar
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 9.192806
## R.2 = 9.376226
## R.3 = 9.648666
## R.4 = 8.788466
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 4
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 2.538806 1.409928 15.5 481 9.192806 4.153388 TRUE
## 2 1 2.530695 1.372546 15.4 1435 9.376226 4.153240 TRUE
## 3 2 2.522636 1.334626 15.4 1436 9.648666 4.153091 TRUE
## 4 3 2.514568 1.295497 13.9 1575 8.788466 4.152943 TRUE
#4 outliers
rosnerTest(data$chlorides, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$chlorides
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 11.123555
## R.2 = 11.562755
## R.3 = 8.780833
## R.4 = 8.932900
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 4
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 0.08746654 0.04706530 0.611 259 11.123555 4.153388 TRUE
## 2 1 0.08713892 0.04521942 0.610 152 11.562755 4.153240 TRUE
## 3 2 0.08681152 0.04329754 0.467 107 8.780833 4.153091 TRUE
## 4 3 0.08657331 0.04225130 0.464 82 8.932900 4.152943 TRUE
#4 outliers
rosnerTest(data$free.sulfur.dioxide, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$free.sulfur.dioxide
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 5.365606
## R.2 = 5.030550
## R.3 = 5.072499
## R.4 = 4.919615
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 4
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 15.87492 10.46016 72 1245 5.365606 4.153388 TRUE
## 2 1 15.83980 10.36869 68 397 5.030550 4.153240 TRUE
## 3 2 15.80714 10.28938 68 401 5.072499 4.153091 TRUE
## 4 3 15.77444 10.20925 66 1559 4.919615 4.152943 TRUE
#0 outliers
rosnerTest(data$density, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$density
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 3.678904
## R.2 = 3.695748
## R.3 = 3.561134
## R.4 = 3.576495
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 0
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 0.9967467 0.001887334 1.00369 1435 3.678904 4.153388 FALSE
## 2 1 0.9967423 0.001879908 1.00369 1436 3.695748 4.153240 FALSE
## 3 2 0.9967380 0.001872433 0.99007 1018 3.561134 4.153091 FALSE
## 4 3 0.9967422 0.001865559 0.99007 1019 3.576495 4.152943 FALSE
#2 outliers
rosnerTest(data$pH, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$pH
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 4.526866
## R.2 = 4.557617
## R.3 = 3.867629
## R.4 = 3.887110
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 2
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 3.311113 0.1543865 4.01 1317 4.526866 4.153388 TRUE
## 2 1 3.310676 0.1534408 4.01 1322 4.557617 4.153240 TRUE
## 3 2 3.310238 0.1524867 3.90 46 3.867629 4.153091 FALSE
## 4 3 3.309868 0.1518176 3.90 696 3.887110 4.152943 FALSE
#4 outliers
rosnerTest(data$sulphates, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$sulphates
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 7.916200
## R.2 = 7.958429
## R.3 = 7.939605
## R.4 = 8.103844
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 4
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 0.6581488 0.1695070 2.00 152 7.916200 4.153388 TRUE
## 2 1 0.6573091 0.1662000 1.98 93 7.958429 4.153240 TRUE
## 3 2 0.6564809 0.1629198 1.95 87 7.939605 4.153091 TRUE
## 4 3 0.6556704 0.1597180 1.95 92 8.103844 4.152943 TRUE
#1 outlier
rosnerTest(data$alcohol, k = 4, warn = F)
##
## Results of Outlier Test
## -------------------------
##
## Test Method: Rosner's Test for Outliers
##
## Hypothesized Distribution: Normal
##
## Data: data$alcohol
##
## Sample Size: 1599
##
## Test Statistics: R.1 = 4.201138
## R.2 = 3.376887
## R.3 = 3.390076
## R.4 = 3.403421
##
## Test Statistic Parameter: k = 4
##
## Alternative Hypothesis: Up to 4 observations are not
## from the same Distribution.
##
## Type I Error: 5%
##
## Number of Outliers Detected: 1
##
## i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
## 1 0 10.42298 1.065668 14.9 653 4.201138 4.153388 TRUE
## 2 1 10.42018 1.060094 14.0 143 3.376887 4.153240 FALSE
## 3 2 10.41794 1.056631 14.0 145 3.390076 4.153091 FALSE
## 4 3 10.41570 1.053148 14.0 468 3.403421 4.152943 FALSE
#corr of data
corrmatrix <- cor(data)
corrplot(corrmatrix, method = 'pie', type="upper")
Moderate corr of 0.6ish for following 1) fixed acidity - citric acid 2) fixed acidity - density 3) free.sulfur.dioxide - total.sulfur.dioxide
#we will remove fixed acidity and total.sulfur
data <- data[-c(1,7)]
#above are visualized and validated via plot
plot(data)
We replaced outliers with 5th and 95th percentile values.
#replace outliers with 5th and 95th percentile values
#remember An outlier is not any point over the 95th percentile
#or below the 5th percentile. Instead, an outlier is considered so
#if it is below the first quartile – 1.5·IQR or above third quartile + 1.5·IQR.
capOutlier <- function(x){
qnt <- quantile(x, probs=c(.25, .75), na.rm = T)
caps <- quantile(x, probs=c(.05, .95), na.rm = T)
H <- 1.5 * IQR(x, na.rm = T)
x[x < (qnt[1] - H)] <- caps[1]
x[x > (qnt[2] + H)] <- caps[2]
return(x)
}
data$volatile.acidity=capOutlier(data$volatile.acidity)
data$residual.sugar=capOutlier(data$residual.sugar)
data$chlorides=capOutlier(data$chlorides)
data$total.sulfur.dioxide=capOutlier(data$total.sulfur.dioxide)
data$density=capOutlier(data$density)
data$pH=capOutlier(data$pH)
data$sulphates=capOutlier(data$sulphates)
data$alcohol=capOutlier(data$alcohol)
#scale data
data_scaled <- data.frame(scale(data))
#visualize how well data was scaled
boxplot(data_scaled)
#scaling looks good
silhouette_score <- function(k){
km <- kmeans(data, centers = k, nstart=25)
ss <- silhouette(km$cluster, dist(data_scaled))
mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)
fviz_nbclust(data_scaled, kmeans, method='silhouette')
# ##### Build pca using princomp
data_pca1 <- princomp(data_scaled)
#examine the importance of PCs
summary(data_pca1)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.7683466 1.3882991 1.0542077 0.99435124 0.94105185
## Proportion of Variance 0.3129006 0.1928580 0.1112049 0.09893531 0.08861328
## Cumulative Proportion 0.3129006 0.5057587 0.6169636 0.71589893 0.80451221
## Comp.6 Comp.7 Comp.8 Comp.9
## Standard deviation 0.83825050 0.74793169 0.6311535 0.47081888
## Proportion of Variance 0.07031036 0.05597519 0.0398604 0.02218091
## Cumulative Proportion 0.87482257 0.93079776 0.9706582 0.99283907
## Comp.10
## Standard deviation 0.267515433
## Proportion of Variance 0.007160929
## Cumulative Proportion 1.000000000
# inspect principal components
# loadings shows variance and and how much each variable contributes to each components
loadings(data_pca1)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## fixed.acidity 0.497 0.147 0.233 0.132 0.270
## volatile.acidity -0.231 -0.480 0.307 -0.101 0.645
## citric.acid 0.457 0.263 -0.112 0.141 -0.136
## residual.sugar 0.176 -0.141 -0.757 0.304 0.238 -0.154
## chlorides 0.234 -0.346 -0.482 -0.655 -0.343
## total.sulfur.dioxide -0.288 -0.343 -0.766 0.280 -0.114 0.181
## density 0.410 -0.342 0.127 0.482
## pH -0.422 -0.262 -0.275 0.393 -0.343
## sulphates 0.218 0.268 -0.235 -0.270 -0.710 0.199 0.378
## alcohol -0.103 0.535 -0.390 0.249 -0.314 0.233
## Comp.8 Comp.9 Comp.10
## fixed.acidity 0.264 0.353 0.618
## volatile.acidity 0.192 -0.389
## citric.acid 0.348 -0.737
## residual.sugar -0.400 0.163
## chlorides 0.133 0.114
## total.sulfur.dioxide 0.229 0.180
## density 0.253 0.195 -0.593
## pH 0.537 0.338
## sulphates -0.241
## alcohol 0.363 0.286 -0.342
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1
## Cumulative Var 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8
## Comp.9 Comp.10
## SS loadings 1.0 1.0
## Proportion Var 0.1 0.1
## Cumulative Var 0.9 1.0
fviz_pca_var(data_pca1,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
Above shows that first 8 give us 89% of the variance We don’t see one variable being overbearing
#plot
plot(data_pca1)
# scree plot
plot(data_pca1, type = "lines")
#biplot
biplot(data_pca1, col = c("gray", "black"))
# ###### using princomp first 10 give us 90% of the variance
# ##### build pca using prcomp
data_pca2 <- prcomp(data_scaled)
summary(data_pca2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7689 1.3887 1.0545 0.99466 0.94135 0.83851
## Proportion of Variance 0.3129 0.1929 0.1112 0.09894 0.08861 0.07031
## Cumulative Proportion 0.3129 0.5058 0.6170 0.71590 0.80451 0.87482
## PC7 PC8 PC9 PC10
## Standard deviation 0.74817 0.63135 0.47097 0.26760
## Proportion of Variance 0.05598 0.03986 0.02218 0.00716
## Cumulative Proportion 0.93080 0.97066 0.99284 1.00000
#above shows first 6 account for 88% variance
#plot
plot(data_pca2)
# scree plot
plot(data_pca2, type = "lines")
#biplot
biplot(data_pca2, col = c("gray", "black"))
fviz_pca_var(data_pca2,
col.var = "contrib", # Color by contributions
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
Above shows first 7 account for 90% variance
Please note: PCA didnt really didnt help at all because of the number of components. We were hoping for a reduction down to 3 - 5 components. stick with scaled data
datatocluster <- data_scaled
Silhouette method recommends 2
fviz_nbclust(datatocluster, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
Elbow method seems to show elbow at 4 or 5
fviz_nbclust(datatocluster, kmeans, method = "wss")+
labs(subtitle = "Elbow method")
Gap statistic recommends 1
#gap stat takes some time, uncomment to run it
set.seed(123)
#fviz_nbclust(datatocluster, kmeans, nstart = 25, method = "gap_stat", nboot = 50) + labs(subtitle = "Gap statistic method")
#euclidean takes some time, uncomment to run it
#lets also try nbclust using hierarchical clustering
#recommends 4
#NbClust(data = datatocluster, diss = NULL, distance = "euclidean",min.nc = 2, max.nc = 15, method = "ward.D")
Optimal clusters is 4
k=4;
fit <- kmeans(datatocluster, k, nstart=25, iter.max=200)
fit
## K-means clustering with 4 clusters of sizes 320, 382, 300, 597
##
## Cluster means:
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 -0.57414243 -0.6042749 0.02563199 -0.3027505 -0.83944657
## 2 1.29529413 -0.6887925 1.15928827 0.2488172 0.38235662
## 3 -0.03712447 0.3360568 0.07558678 0.5274614 0.43915001
## 4 -0.50241112 0.5957616 -0.79351154 -0.2619873 -0.01538079
## total.sulfur.dioxide density pH sulphates alcohol
## 1 -0.2161077 -1.2115084 0.4736841 0.3034858 1.2406304
## 2 -0.4441063 0.8315168 -0.8798589 0.5862033 0.1084757
## 3 1.4643972 0.4837596 -0.3572738 -0.1550951 -0.7100370
## 4 -0.3358729 -0.1257699 0.4886254 -0.4598268 -0.3776019
##
## Clustering vector:
## [1] 4 3 4 2 4 4 4 4 4 3 4 3 4 4 3 3 3 3 4 2 3 3 2 3 4 4 4 2 4 4 4 4 3 3
## [35] 4 4 4 4 4 3 3 3 2 4 4 1 3 2 4 3 4 4 4 3 3 3 2 3 4 4 3 3 4 4 4 4 4 4
## [69] 2 4 4 3 3 4 2 2 2 4 4 3 4 2 3 3 1 4 3 4 3 4 3 3 3 4 4 1 4 4 4 4 4 4
## [103] 4 4 4 4 2 4 3 3 3 3 3 2 3 2 4 4 4 4 3 4 4 4 3 3 4 4 1 4 3 1 1 4 4 3
## [137] 4 4 3 3 3 4 1 4 1 3 4 3 4 4 1 2 4 4 3 3 3 3 4 4 4 4 4 3 3 3 3 4 4 3
## [171] 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 3 3 4 3 3 3 4 3 4 4 3 4 2 1 4 2 3 4 4
## [205] 4 2 2 3 3 2 1 4 2 4 4 3 4 4 4 3 3 3 4 4 4 4 3 4 4 4 1 4 3 4 4 4 4 4
## [239] 4 4 2 2 3 2 2 4 4 3 4 4 2 4 2 3 4 3 2 4 2 2 4 4 4 4 2 2 4 1 4 2 3 2
## [273] 2 4 3 3 4 2 2 2 2 2 4 2 3 3 2 4 4 2 4 2 2 4 2 2 3 4 4 4 4 2 4 4 3 2
## [307] 4 2 2 4 2 3 3 3 1 1 3 3 2 3 2 3 4 3 3 3 2 2 2 2 2 2 3 4 4 2 1 3 2 2
## [341] 2 2 2 2 2 4 1 2 2 4 2 4 4 2 1 1 2 2 2 2 3 2 2 2 2 2 2 2 2 2 4 2 2 3
## [375] 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 2 1 2 2 3 2 2 3 2 2 4 3 1 2 2 4 1 2 2
## [409] 2 2 3 3 3 2 3 3 2 3 2 4 2 4 4 2 4 4 1 4 4 2 2 3 2 2 2 2 4 2 2 4 2 2
## [443] 2 2 1 4 2 2 4 2 2 2 4 2 1 2 4 3 2 2 2 4 2 3 2 2 2 1 2 4 2 2 2 2 2 2
## [477] 2 2 2 4 2 2 2 2 2 4 4 4 2 2 3 1 1 3 1 2 4 3 2 3 4 2 2 2 2 2 2 2 3 2
## [511] 2 3 2 2 2 3 2 2 2 3 2 4 3 3 3 3 3 1 3 3 2 2 2 1 2 2 3 4 2 2 3 2 4 2
## [545] 2 3 4 2 2 2 4 2 2 1 2 2 2 2 2 2 2 3 3 3 2 2 4 4 2 1 2 1 2 2 2 2 4 3
## [579] 3 2 2 2 2 2 2 4 2 3 1 2 3 1 3 2 4 3 2 2 4 2 4 2 4 2 3 4 2 3 3 1 2 2
## [613] 4 1 3 3 3 2 2 2 3 3 4 1 3 3 4 4 4 3 4 2 4 3 3 4 3 3 4 2 2 3 2 3 2 4
## [647] 4 4 1 3 2 3 2 2 2 3 2 2 4 4 4 4 4 2 2 3 2 2 2 2 4 4 3 4 2 2 2 4 3 2
## [681] 2 4 3 4 3 4 4 3 4 2 4 3 2 3 3 1 4 4 3 2 3 4 4 3 4 3 4 4 1 2 3 3 4 4
## [715] 3 4 4 4 4 4 4 3 4 3 4 4 4 4 4 4 2 4 4 3 4 4 4 4 3 4 4 3 4 2 3 4 3 3
## [749] 4 4 4 4 3 4 2 4 4 4 4 3 3 4 4 4 4 3 3 3 4 4 4 3 3 2 2 4 4 4 4 4 4 4
## [783] 3 4 4 2 2 3 3 3 3 3 4 4 1 2 3 1 2 2 3 4 1 4 4 1 1 1 4 4 4 2 2 1 2 2
## [817] 2 2 4 3 4 1 4 4 4 4 1 4 1 1 4 1 2 2 4 4 1 1 2 4 2 4 2 3 2 4 4 4 4 4
## [851] 2 2 3 1 1 4 1 1 2 1 4 1 4 4 4 4 1 1 1 4 1 4 4 1 2 2 4 1 3 4 4 4 2 4
## [885] 3 4 4 2 4 3 1 4 2 4 4 4 1 4 1 4 1 4 4 4 4 3 3 4 4 1 2 2 1 2 1 1 4 3
## [919] 1 1 2 1 1 3 1 1 2 3 1 1 4 4 4 4 4 1 1 2 1 1 1 2 2 2 1 2 2 1 1 1 1 1
## [953] 1 1 1 2 2 2 4 4 1 4 4 2 1 1 2 3 1 4 2 2 2 1 2 4 4 3 1 2 4 4 1 4 2 4
## [987] 1 4 4 2 4 4 4 4 3 4 1 1 4 1 1 2 1 1 4 1 1 1 2 2 1 2 4 4 4 2 1 1 1 4
## [1021] 2 2 4 1 4 4 1 4 4 4 4 4 4 4 4 2 1 4 1 1 4 4 1 2 1 1 4 4 2 2 4 2 1 1
## [1055] 3 3 2 3 2 2 2 1 1 2 1 4 1 2 2 3 1 3 3 4 3 2 2 2 2 1 1 1 4 1 4 4 1 1
## [1089] 2 2 2 1 1 1 4 2 4 4 1 4 1 1 1 1 1 1 1 1 4 2 4 1 1 2 1 4 4 4 1 1 1 1
## [1123] 1 2 4 1 1 4 3 2 4 1 1 4 1 1 2 2 3 3 3 1 1 1 1 1 4 2 1 1 1 1 4 2 4 4
## [1157] 1 1 1 2 2 2 1 4 4 2 2 1 1 1 2 4 1 4 4 4 4 1 1 1 1 1 2 4 3 1 4 1 3 4
## [1191] 2 4 1 4 4 4 3 4 1 3 4 1 1 3 1 1 1 2 1 1 4 4 4 2 2 1 3 1 1 1 2 2 3 2
## [1225] 2 3 3 4 1 4 1 4 4 2 1 3 4 1 4 1 4 2 1 3 3 4 4 4 1 4 4 4 4 4 4 4 4 3
## [1259] 4 4 2 4 3 4 1 4 4 2 4 1 1 1 1 4 4 3 1 4 3 1 4 4 4 3 1 2 1 1 3 3 4 4
## [1293] 1 4 4 3 3 1 1 4 1 4 1 1 3 3 3 4 3 4 3 1 4 4 4 3 1 2 3 2 3 1 1 1 1 1
## [1327] 1 1 4 3 3 3 4 4 4 1 4 4 4 4 4 4 4 4 2 4 1 4 4 4 3 1 4 4 4 4 4 1 3 2
## [1361] 2 4 2 4 4 4 4 3 3 4 3 2 3 3 4 3 4 1 4 4 4 4 3 3 3 3 4 4 3 3 1 4 4 4
## [1395] 3 4 4 3 4 4 3 3 1 1 4 1 2 1 1 1 4 1 2 3 2 4 2 1 4 3 4 4 1 4 2 2 1 1
## [1429] 4 1 1 3 1 1 3 3 3 4 4 4 1 3 4 1 4 3 4 4 4 1 1 2 1 3 2 4 1 3 1 1 4 4
## [1463] 4 4 4 4 4 4 4 4 4 1 1 1 3 1 3 1 4 2 4 2 4 1 4 4 4 4 1 1 1 1 1 3 1 1
## [1497] 3 4 4 4 4 3 4 1 1 4 4 1 2 1 1 4 4 1 3 3 1 1 4 4 1 4 1 4 1 4 4 4 2 4
## [1531] 1 4 4 3 1 4 4 4 1 1 1 1 4 2 1 4 4 1 2 1 4 4 4 4 4 1 4 4 3 3 3 3 4 4
## [1565] 4 1 1 4 4 1 1 1 3 1 3 1 1 4 4 1 1 1 1 3 1 1 1 1 1 3 1 4 1 4 4 1 1 4
## [1599] 1
##
## Within cluster sum of squares by cluster:
## [1] 2097.204 2876.922 2025.222 2940.944
## (between_SS / total_SS = 37.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Cluster sizes
sort(table(fit$cluster))
##
## 3 1 2 4
## 300 320 382 597
datatovisualize1 <- data_scaled
#clusplot below, but is not useful as we performed analysis in 9 dimensions
#clusplot below uses first 2 dimensions which covers 47% variability
#too much overlap in 2d
#should really plot pairs or each variable against cluster
clusplot(datatovisualize1, fit$cluster,cex=1,xlab=colnames(datatovisualize1)[1],ylab=colnames(datatovisualize1)[2],col.p=fit$cluster,lines=0,labels=1)
#pairwise plot
#pairwise keeps running and crashes
#uncomment if need to run
#pairs(datatovisualize1, col=c("red","blue","green","yellow")[fit$cluster])
#put quality back in data
datatovisualize1$quality <- raw$quality
old.par <- par(mar = c(2, 2, 2, 2))
par(mfrow=c(2,2))
for(i in 1:10){
boxplot(datatovisualize1[,i] ~ fit$cluster,
xlab='Cluster Number', ylab=colnames(datatovisualize1)[i],
main=paste('Clusters of ', as.character(colnames(datatovisualize1)[i])))
}
#reset graphics
par(old.par)
#cluster 3 is higher quality than others
#why is that?
# cluster 3: alcohol, pH are higher
# cluster 3: sugar, chlorides, density are lower
#put cluster back into raw data so we can save raw data and show in shiny app
raw$cluster_kmeans=fit$cluster
We implement a Ward’s hierarchical clustering procedure:
#distance matrix
d <- dist(data_scaled, method = "euclidean")
#clustering
h_clust <- hclust(d, method = "ward.D2")
Display dendrogram with cut rects
plot(h_clust)
rect.hclust(h_clust , k = 5, border = 2:8)
abline(h = 5, col = 'red')
Display clusters in different colors
avg_dend_obj <- as.dendrogram(h_clust)
avg_col_dend <- color_branches(avg_dend_obj, h = 40)
plot(avg_col_dend)
Extract clusters
groups <- cutree(h_clust,k=5)
Count number of instances in each group
data_df_cl <- mutate(data, cluster = groups)
count(data_df_cl, cluster)
## # A tibble: 5 x 2
## cluster n
## <int> <int>
## 1 1 526
## 2 2 361
## 3 3 382
## 4 4 153
## 5 5 177
We can analysis the quality and
summary(data_df_cl)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. :0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.:1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median :2.200
## Mean : 8.32 Mean :0.5243 Mean :0.271 Mean :2.463
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.:2.600
## Max. :15.90 Max. :1.0100 Max. :1.000 Max. :5.100
## chlorides total.sulfur.dioxide density pH
## Min. :0.04100 Min. : 6.00 Min. :0.9923 Min. :2.93
## 1st Qu.:0.07000 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.21
## Median :0.07900 Median : 38.00 Median :0.9968 Median :3.31
## Mean :0.08167 Mean : 45.37 Mean :0.9967 Mean :3.31
## 3rd Qu.:0.09000 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.40
## Max. :0.12610 Max. :122.00 Max. :1.0010 Max. :3.68
## sulphates alcohol cluster
## Min. :0.3300 Min. : 8.40 Min. :1.000
## 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:1.000
## Median :0.6200 Median :10.20 Median :2.000
## Mean :0.6472 Mean :10.41 Mean :2.433
## 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:3.000
## Max. :0.9900 Max. :13.50 Max. :5.000
ggplot(data_df_cl, aes(x=density, y = residual.sugar , color = factor(cluster))) + geom_point()
clusplot(data_df_cl, fit$cluster,cex=1,xlab=colnames(data_df_cl)[1],ylab=colnames(data_df_cl)[2],col.p=groups,lines=0,labels=1)
datatovisualize2 <- data_scaled
2D representation of the Segmentation:
clusplot(datatovisualize2, groups, main='Clusters')
# pairwise plot
# pairs(datatovisualize2, col=c("red","blue","green","yellow")[groups])
#put quality back in data
datatovisualize2$quality <- raw$quality
par(old.par)
par(mfrow=c(2,2))
for(i in 1:10){
boxplot(datatovisualize2[,i] ~ groups,
xlab='Cluster Number', ylab=colnames(datatovisualize2)[i],
main=paste('Clusters of ', as.character(colnames(datatovisualize2)[i])))
}
#reset graphics
par(old.par)
#cluster 4 is higher quality than others
#why is that?
# cluster 4: alcohol is higher
# cluster 4: chlorides, density are lower
#put cluster back into raw data so we can save raw data and show in shiny app
raw$cluster_hclust=groups
#TODO save data to file to load in shiny
save(raw, file = "./WineAnalysis/shiny.RData")